Reading packages and setting up theme

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidytuesdayR))
suppressPackageStartupMessages(library(wesanderson))
suppressPackageStartupMessages(library(sjPlot))
suppressPackageStartupMessages(library(car))
suppressPackageStartupMessages(library(performance))
suppressPackageStartupMessages(library(robust))

theme_set(theme_minimal())

Reading data

tuesdata <- tt_load('2021-04-20')
## ---- Compiling #TidyTuesday Information for 2021-04-20 ----
## --- There is 1 file available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 1: "netflix_titles.csv"
netflix <- tuesdata$netflix

Exploring and cleaning/recoding variables

Checking data structure and variable values

netflix %>% glimpse()
## Rows: 7,787
## Columns: 12
## $ show_id      <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1…
## $ type         <chr> "TV Show", "Movie", "Movie", "Movie", "Movie", "TV Show",…
## $ title        <chr> "3%", "7:19", "23:59", "9", "21", "46", "122", "187", "70…
## $ director     <chr> NA, "Jorge Michel Grau", "Gilbert Chan", "Shane Acker", "…
## $ cast         <chr> "João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Val…
## $ country      <chr> "Brazil", "Mexico", "Singapore", "United States", "United…
## $ date_added   <chr> "August 14, 2020", "December 23, 2016", "December 20, 201…
## $ release_year <dbl> 2020, 2016, 2011, 2009, 2008, 2016, 2019, 1997, 2019, 200…
## $ rating       <chr> "TV-MA", "TV-MA", "R", "PG-13", "PG-13", "TV-MA", "TV-MA"…
## $ duration     <chr> "4 Seasons", "93 min", "78 min", "80 min", "123 min", "1 …
## $ listed_in    <chr> "International TV Shows, TV Dramas, TV Sci-Fi & Fantasy",…
## $ description  <chr> "In a future where the elite inhabit an island paradise f…
netflix %>% count(type)
## # A tibble: 2 × 2
##   type        n
##   <chr>   <int>
## 1 Movie    5377
## 2 TV Show  2410
netflix %>% count(release_year)
## # A tibble: 73 × 2
##    release_year     n
##           <dbl> <int>
##  1         1925     1
##  2         1942     2
##  3         1943     3
##  4         1944     3
##  5         1945     3
##  6         1946     2
##  7         1947     1
##  8         1954     2
##  9         1955     3
## 10         1956     2
## # ℹ 63 more rows
netflix %>% reframe(range_release_year = range(release_year)) # 1925-2021
## # A tibble: 2 × 1
##   range_release_year
##                <dbl>
## 1               1925
## 2               2021
netflix %>% count(country)
## # A tibble: 682 × 2
##    country                                                 n
##    <chr>                                               <int>
##  1 Argentina                                              50
##  2 Argentina, Brazil, France, Poland, Germany, Denmark     1
##  3 Argentina, Chile                                        1
##  4 Argentina, Chile, Peru                                  1
##  5 Argentina, France                                       1
##  6 Argentina, France, United States, Germany, Qatar        1
##  7 Argentina, Italy                                        1
##  8 Argentina, Spain                                        8
##  9 Argentina, United States                                1
## 10 Argentina, United States, Mexico                        1
## # ℹ 672 more rows
netflix %>% count(duration)
## # A tibble: 216 × 2
##    duration       n
##    <chr>      <int>
##  1 1 Season    1608
##  2 10 Seasons     6
##  3 10 min         1
##  4 100 min       97
##  5 101 min       96
##  6 102 min       98
##  7 103 min      101
##  8 104 min       89
##  9 105 min       91
## 10 106 min       97
## # ℹ 206 more rows
netflix %>% count(date_added)
## # A tibble: 1,566 × 2
##    date_added              n
##    <chr>               <int>
##  1 " April 15, 2018"       1
##  2 " April 16, 2019"       1
##  3 " April 17, 2016"       1
##  4 " April 20, 2017"       1
##  5 " April 4, 2017"        1
##  6 " August 1, 2017"       1
##  7 " August 13, 2018"      1
##  8 " August 21, 2017"      1
##  9 " August 4, 2017"       3
## 10 " December 1, 2018"     1
## # ℹ 1,556 more rows

Tidying variables

netflix_tidy <- netflix %>% 
  mutate(type = factor(type)) %>% 
  separate(duration, into = c("duration", "duration_units"), sep = " ", convert = TRUE) %>% 
  mutate(
    duration_units = case_when(
      duration_units == "min" ~ "minutes",
      duration_units == "Season" ~ "seasons",
      duration_units == "Seasons" ~ "seasons"
    )
  ) %>%
  mutate(duration_units = factor(duration_units)) %>% 
  mutate(date_added = mdy(date_added),
         year_added = year(date_added)
  )

netflix_tidy %>% count(duration)
## # A tibble: 206 × 2
##    duration     n
##       <int> <int>
##  1        1  1608
##  2        2   382
##  3        3   185
##  4        4    87
##  5        5    59
##  6        6    30
##  7        7    19
##  8        8    19
##  9        9     9
## 10       10     7
## # ℹ 196 more rows
netflix_tidy %>% count(duration_units)
## # A tibble: 2 × 2
##   duration_units     n
##   <fct>          <int>
## 1 minutes         5377
## 2 seasons         2410
netflix_tidy %>% count(date_added)
## # A tibble: 1,513 × 2
##    date_added     n
##    <date>     <int>
##  1 2008-01-01     1
##  2 2008-02-04     1
##  3 2009-05-05     1
##  4 2009-11-18     1
##  5 2010-11-01     1
##  6 2011-05-17     1
##  7 2011-09-27     1
##  8 2011-10-01    11
##  9 2012-02-21     1
## 10 2012-11-14     1
## # ℹ 1,503 more rows
netflix_tidy %>% count(year_added)
## # A tibble: 15 × 2
##    year_added     n
##         <dbl> <int>
##  1       2008     2
##  2       2009     2
##  3       2010     1
##  4       2011    13
##  5       2012     3
##  6       2013    11
##  7       2014    25
##  8       2015    88
##  9       2016   443
## 10       2017  1225
## 11       2018  1685
## 12       2019  2153
## 13       2020  2009
## 14       2021   117
## 15         NA    10

Checking for duplicates

duplicate_show_ids <- netflix_tidy %>%
  filter(duplicated(show_id) | duplicated(show_id, fromLast = TRUE))
  # -> no duplicates found

Checking for and identifying missing values

anyNA(netflix_tidy)
## [1] TRUE
missing_summary <- netflix_tidy %>%
  summarise(
    country_missing = sum(is.na(country)),
    director_missing = sum(is.na(director)),
    cast_missing = sum(is.na(cast))
  )
missing_summary
## # A tibble: 1 × 3
##   country_missing director_missing cast_missing
##             <int>            <int>        <int>
## 1             507             2389          718
missing_percent <- netflix_tidy %>%
  summarise(
    country_missing_pct = mean(is.na(country)) * 100,
    director_missing_pct = mean(is.na(director)) * 100,
    cast_missing_pct = mean(is.na(cast)) * 100
  )
missing_percent
## # A tibble: 1 × 3
##   country_missing_pct director_missing_pct cast_missing_pct
##                 <dbl>                <dbl>            <dbl>
## 1                6.51                 30.7             9.22

Exploratory data analysis

Plotting distributions

Content types

netflix_tidy %>%
  count(type) %>%
  ggplot(aes("", n, fill = type)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  labs(title = "Distribution of content types on Netflix",
       x = NULL, y = NULL,
       fill = "Content type") +
  theme_void()

Content by country

netflix_tidy %>%
  count(country, sort = TRUE) %>%
  filter(n > 100) %>%
  filter(!is.na(country)) %>%
  ggplot(aes(fct_reorder(country, n), n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Content by country on Netflix",
       x = "Country", y = "Number of movies/shows")

Release year by content type

# density plot of release year by type
netflix_tidy %>%
  ggplot(aes(release_year, fill = type)) +
  geom_density(alpha = 0.5) +
  labs(title = "Release year distribution by content type",
       x = "Release year", y = "Density",
       fill = "Content type") +
  facet_wrap(~type, ncol = 1, scales = "free_y")

Duration by genre

netflix_tidy %>% 
  filter(type == "Movie") %>%
  filter(listed_in != "Movies") %>% 
  separate_rows(listed_in, sep = ", ") %>%
  group_by(genre = factor(listed_in)) %>%
  reframe(
    n = n(),
    mean_duration = mean(duration),
  ) %>%
  ggplot(aes(mean_duration, reorder(genre, mean_duration), fill = reorder(genre, mean_duration))) +
  geom_col() +
  theme(legend.position = "none") +
  labs(title = "Average duration of movies by genre",
       x = "Average duration (minutes)", y = "Genre")

Rating by genre

netflix_tidy %>%
  filter(type == "Movie") %>% 
  separate_rows(listed_in, sep = ", ") %>%
  count(genre = listed_in, rating) %>% 
  group_by(genre) %>% 
  mutate(percentage = n / sum(n) * 100) %>% 
  ungroup() %>% 
  ggplot(aes("", percentage, fill = rating)) +
  geom_bar(stat = "identity", width = 1, color = "white", size = 0.25) +
  coord_polar("y", start = 0) +
  facet_wrap(~genre) +
  labs(
    title = "Ratings Distribution in Each Genre",
    fill = "Rating"
  ) +
  theme_void() +
  theme(strip.text = element_text(size = 10, face = "bold"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Hypothesis testing

Hypothesis: The duration of movies has decreased over time due to the popularity of bite-sized content. I also included rating as a predictor to see if certain ratings are more affected by this trend than others.

Data preparation

duration_trend <- netflix_tidy %>%
  filter(!is.na(duration), !is.na(release_year), !is.na(type), !is.na(rating)) %>%
  filter(type == "Movie")

Buiding the complex model

duration_trend_model_complex <- lm(duration ~ release_year * rating, data = duration_trend)
summary(duration_trend_model_complex)
## 
## Call:
## lm(formula = duration ~ release_year * rating, data = duration_trend)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -139.818  -13.785   -1.063   13.602  217.062 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  1.625e+03  4.570e+02   3.555 0.000381 ***
## release_year                -7.690e-01  2.287e-01  -3.363 0.000777 ***
## ratingNC-17                  3.060e+04  1.370e+04   2.233 0.025592 *  
## ratingNR                    -1.571e+03  7.172e+02  -2.191 0.028521 *  
## ratingPG                    -6.363e+02  5.461e+02  -1.165 0.244011    
## ratingPG-13                 -6.998e+02  5.407e+02  -1.294 0.195673    
## ratingR                     -1.032e+03  4.989e+02  -2.068 0.038645 *  
## ratingTV-14                 -1.617e+02  4.754e+02  -0.340 0.733700    
## ratingTV-G                  -4.972e+02  7.752e+02  -0.641 0.521315    
## ratingTV-MA                 -1.134e+03  4.963e+02  -2.285 0.022342 *  
## ratingTV-PG                 -3.934e+02  4.911e+02  -0.801 0.423087    
## ratingTV-Y                   1.841e+03  1.545e+03   1.192 0.233440    
## ratingTV-Y7                 -5.375e+03  1.465e+03  -3.668 0.000247 ***
## ratingTV-Y7-FV              -7.601e+03  1.047e+04  -0.726 0.467824    
## ratingUR                    -1.701e+03  1.626e+03  -1.046 0.295425    
## release_year:ratingNC-17    -1.516e+01  6.800e+00  -2.229 0.025834 *  
## release_year:ratingNR        7.898e-01  3.576e-01   2.209 0.027250 *  
## release_year:ratingPG        3.260e-01  2.728e-01   1.195 0.232245    
## release_year:ratingPG-13     3.623e-01  2.702e-01   1.341 0.180014    
## release_year:ratingR         5.270e-01  2.494e-01   2.113 0.034617 *  
## release_year:ratingTV-14     9.686e-02  2.378e-01   0.407 0.683737    
## release_year:ratingTV-G      2.492e-01  3.857e-01   0.646 0.518191    
## release_year:ratingTV-MA     5.729e-01  2.480e-01   2.310 0.020936 *  
## release_year:ratingTV-PG     2.041e-01  2.455e-01   0.831 0.405876    
## release_year:ratingTV-Y     -9.260e-01  7.664e-01  -1.208 0.227014    
## release_year:ratingTV-Y7     2.660e+00  7.276e-01   3.656 0.000259 ***
## release_year:ratingTV-Y7-FV  3.769e+00  5.196e+00   0.725 0.468236    
## release_year:ratingUR        8.605e-01  8.120e-01   1.060 0.289346    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.43 on 5344 degrees of freedom
## Multiple R-squared:  0.2092, Adjusted R-squared:  0.2052 
## F-statistic: 52.37 on 27 and 5344 DF,  p-value: < 2.2e-16
tab_model(duration_trend_model_complex, show.aic = TRUE)
  duration
Predictors Estimates CI p
(Intercept) 1624.89 728.95 – 2520.82 <0.001
release year -0.77 -1.22 – -0.32 0.001
ratingNC-17 30596.18 3734.59 – 57457.77 0.026
rating [NR] -1571.16 -2977.20 – -165.12 0.029
rating [PG] -636.30 -1706.91 – 434.30 0.244
ratingPG-13 -699.78 -1759.84 – 360.27 0.196
rating [R] -1031.90 -2009.90 – -53.90 0.039
ratingTV-14 -161.75 -1093.77 – 770.27 0.734
rating [TV-G] -497.19 -2016.93 – 1022.54 0.521
rating [TV-MA] -1134.11 -2107.04 – -161.17 0.022
rating [TV-PG] -393.43 -1356.17 – 569.31 0.423
rating [TV-Y] 1840.82 -1187.48 – 4869.12 0.233
rating [TV-Y7] -5374.62 -8246.97 – -2502.27 <0.001
rating [TV-Y7-FV] -7601.49 -28125.52 – 12922.54 0.468
rating [UR] -1701.18 -4888.32 – 1485.95 0.295
release_year:ratingNC-17 -15.16 -28.49 – -1.83 0.026
release year × rating
[NR]
0.79 0.09 – 1.49 0.027
release year × rating
[PG]
0.33 -0.21 – 0.86 0.232
release_year:ratingPG-13 0.36 -0.17 – 0.89 0.180
release year × rating [R] 0.53 0.04 – 1.02 0.035
release_year:ratingTV-14 0.10 -0.37 – 0.56 0.684
release year × rating
[TV-G]
0.25 -0.51 – 1.01 0.518
release year × rating
[TV-MA]
0.57 0.09 – 1.06 0.021
release year × rating
[TV-PG]
0.20 -0.28 – 0.69 0.406
release year × rating
[TV-Y]
-0.93 -2.43 – 0.58 0.227
release year × rating
[TV-Y7]
2.66 1.23 – 4.09 <0.001
release year × rating
[TV-Y7-FV]
3.77 -6.42 – 13.95 0.468
release year × rating
[UR]
0.86 -0.73 – 2.45 0.289
Observations 5372
R2 / R2 adjusted 0.209 / 0.205
AIC 50041.481

Building the simple model

duration_trend_model_simple <- lm(duration ~ release_year, data = duration_trend)
summary(duration_trend_model_simple)
## 
## Call:
## lm(formula = duration ~ release_year, data = duration_trend)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -124.187  -13.468   -0.655   14.532  215.740 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1315.74682   79.33185   16.59   <2e-16 ***
## release_year   -0.60430    0.03941  -15.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.92 on 5370 degrees of freedom
## Multiple R-squared:  0.04195,    Adjusted R-squared:  0.04177 
## F-statistic: 235.1 on 1 and 5370 DF,  p-value: < 2.2e-16
tab_model(duration_trend_model_simple, show.aic = TRUE)
  duration
Predictors Estimates CI p
(Intercept) 1315.75 1160.22 – 1471.27 <0.001
release year -0.60 -0.68 – -0.53 <0.001
Observations 5372
R2 / R2 adjusted 0.042 / 0.042
AIC 51020.368

Model diagnostics

Checking for influential outliers

plot(duration_trend_model_complex, which = 4)

cooks_distance <- cooks.distance(duration_trend_model_complex)
influential_points <- which(cooks_distance > (4 / nrow(duration_trend)))
influential_points
##  119  121  128  159  176  182  297  337  359  414  415  466  477  518  558  559 
##  119  121  128  159  176  182  297  337  359  414  415  466  477  518  558  559 
##  561  562  573  597  628  691  694  704  717  742  828  846  850  859  878  882 
##  561  562  573  597  628  691  694  704  717  742  828  846  850  859  878  882 
##  903  936  951  955  968 1021 1069 1144 1147 1167 1198 1223 1260 1347 1385 1504 
##  903  936  951  955  968 1021 1069 1144 1147 1167 1198 1223 1260 1347 1385 1504 
## 1507 1508 1534 1556 1560 1617 1652 1653 1691 1715 1723 1739 1816 1912 1941 1980 
## 1507 1508 1534 1556 1560 1617 1652 1653 1691 1715 1723 1739 1816 1912 1941 1980 
## 1985 1986 2044 2055 2118 2167 2226 2250 2279 2381 2405 2435 2461 2467 2469 2472 
## 1985 1986 2044 2055 2118 2167 2226 2250 2279 2381 2405 2435 2461 2467 2469 2472 
## 2485 2495 2496 2497 2498 2508 2510 2511 2512 2514 2515 2518 2525 2563 2564 2566 
## 2485 2495 2496 2497 2498 2508 2510 2511 2512 2514 2515 2518 2525 2563 2564 2566 
## 2569 2583 2592 2602 2660 2688 2698 2700 2714 2746 2769 2841 2918 2925 2930 2946 
## 2569 2583 2592 2602 2660 2688 2698 2700 2714 2746 2769 2841 2918 2925 2930 2946 
## 2994 3082 3113 3117 3174 3219 3221 3225 3230 3346 3366 3408 3411 3415 3416 3432 
## 2994 3082 3113 3117 3174 3219 3221 3225 3230 3346 3366 3408 3411 3415 3416 3432 
## 3434 3436 3437 3438 3439 3442 3449 3451 3455 3470 3476 3520 3532 3548 3556 3666 
## 3434 3436 3437 3438 3439 3442 3449 3451 3455 3470 3476 3520 3532 3548 3556 3666 
## 3720 3724 3725 3735 3740 3780 3784 3851 3861 3886 3888 3899 3912 3922 3933 4012 
## 3720 3724 3725 3735 3740 3780 3784 3851 3861 3886 3888 3899 3912 3922 3933 4012 
## 4060 4075 4079 4102 4118 4121 4132 4133 4249 4265 4278 4284 4324 4344 4505 4527 
## 4060 4075 4079 4102 4118 4121 4132 4133 4249 4265 4278 4284 4324 4344 4505 4527 
## 4579 4580 4589 4600 4614 4635 4636 4637 4705 4747 4750 4761 4801 4854 4892 4903 
## 4579 4580 4589 4600 4614 4635 4636 4637 4705 4747 4750 4761 4801 4854 4892 4903 
## 4912 4913 4921 4938 4942 4999 5006 5010 5029 5075 5158 5174 5230 5246 5254 5259 
## 4912 4913 4921 4938 4942 4999 5006 5010 5029 5075 5158 5174 5230 5246 5254 5259 
## 5297 5299 5309 5340 5341 5353 5361 
## 5297 5299 5309 5340 5341 5353 5361
# removing influential outliers
duration_trend_no_outliers <- duration_trend %>%
  slice(-influential_points)

# re-running both models without influential outliers
duration_trend_model_complex_no_outliers <- lm(duration ~ release_year * rating, data = duration_trend_no_outliers)
summary(duration_trend_model_complex_no_outliers)
## 
## Call:
## lm(formula = duration ~ release_year * rating, data = duration_trend_no_outliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -97.578 -12.733  -0.788  13.056  98.137 
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                873.1014   673.7291   1.296 0.195060    
## release_year                -0.3946     0.3367  -1.172 0.241315    
## ratingNR                  -838.9893  1011.2287  -0.830 0.406762    
## ratingPG                  -257.2059   736.2109  -0.349 0.726830    
## ratingPG-13               -376.7710   742.5183  -0.507 0.611880    
## ratingR                   -212.0949   701.2285  -0.302 0.762312    
## ratingTV-14               1465.8605   690.7539   2.122 0.033876 *  
## ratingTV-G               -3424.4951  1961.5906  -1.746 0.080910 .  
## ratingTV-MA               -352.6469   706.6159  -0.499 0.617755    
## ratingTV-PG               1298.1721   720.8872   1.801 0.071793 .  
## ratingTV-Y                8442.6682  2287.5927   3.691 0.000226 ***
## ratingTV-Y7              -5200.0283  2095.6263  -2.481 0.013120 *  
## ratingUR                    32.2530    23.5451   1.370 0.170796    
## release_year:ratingNR        0.4242     0.5039   0.842 0.399918    
## release_year:ratingPG        0.1365     0.3677   0.371 0.710403    
## release_year:ratingPG-13     0.2010     0.3708   0.542 0.587808    
## release_year:ratingR         0.1187     0.3503   0.339 0.734786    
## release_year:ratingTV-14    -0.7125     0.3451  -2.064 0.039027 *  
## release_year:ratingTV-G      1.6998     0.9735   1.746 0.080849 .  
## release_year:ratingTV-MA     0.1836     0.3529   0.520 0.602861    
## release_year:ratingTV-PG    -0.6368     0.3600  -1.769 0.076982 .  
## release_year:ratingTV-Y     -4.2001     1.1345  -3.702 0.000216 ***
## release_year:ratingTV-Y7     2.5737     1.0402   2.474 0.013382 *  
## release_year:ratingUR            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.98 on 5134 degrees of freedom
## Multiple R-squared:  0.2318, Adjusted R-squared:  0.2285 
## F-statistic:  70.4 on 22 and 5134 DF,  p-value: < 2.2e-16
tab_model(duration_trend_model_complex_no_outliers, show.aic = TRUE)
## Model matrix is rank deficient. Parameters `release_year:ratingUR` were
##   not estimable.
  duration
Predictors Estimates CI p
(Intercept) 873.10 -447.69 – 2193.90 0.195
release year -0.39 -1.05 – 0.27 0.241
rating [NR] -838.99 -2821.43 – 1143.45 0.407
rating [PG] -257.21 -1700.49 – 1186.08 0.727
ratingPG-13 -376.77 -1832.42 – 1078.88 0.612
rating [R] -212.09 -1586.80 – 1162.61 0.762
ratingTV-14 1465.86 111.69 – 2820.03 0.034
rating [TV-G] -3424.50 -7270.05 – 421.06 0.081
rating [TV-MA] -352.65 -1737.92 – 1032.62 0.618
rating [TV-PG] 1298.17 -115.07 – 2711.42 0.072
rating [TV-Y] 8442.67 3958.01 – 12927.32 <0.001
rating [TV-Y7] -5200.03 -9308.35 – -1091.71 0.013
rating [UR] 32.25 -13.91 – 78.41 0.171
release year × rating
[NR]
0.42 -0.56 – 1.41 0.400
release year × rating
[PG]
0.14 -0.58 – 0.86 0.710
release_year:ratingPG-13 0.20 -0.53 – 0.93 0.588
release year × rating [R] 0.12 -0.57 – 0.81 0.735
release_year:ratingTV-14 -0.71 -1.39 – -0.04 0.039
release year × rating
[TV-G]
1.70 -0.21 – 3.61 0.081
release year × rating
[TV-MA]
0.18 -0.51 – 0.88 0.603
release year × rating
[TV-PG]
-0.64 -1.34 – 0.07 0.077
release year × rating
[TV-Y]
-4.20 -6.42 – -1.98 <0.001
release year × rating
[TV-Y7]
2.57 0.53 – 4.61 0.013
Observations 5157
R2 / R2 adjusted 0.232 / 0.228
AIC 46990.286
duration_trend_model_simple_no_outliers <- lm(duration ~ release_year, data = duration_trend_no_outliers)
summary(duration_trend_model_simple_no_outliers)
## 
## Call:
## lm(formula = duration ~ release_year, data = duration_trend_no_outliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -89.495 -12.859  -0.859  13.929 110.048 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1686.30324   90.90469   18.55   <2e-16 ***
## release_year   -0.78802    0.04514  -17.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.42 on 5155 degrees of freedom
## Multiple R-squared:  0.05581,    Adjusted R-squared:  0.05563 
## F-statistic: 304.7 on 1 and 5155 DF,  p-value: < 2.2e-16
tab_model(duration_trend_model_simple_no_outliers, show.aic = TRUE)
  duration
Predictors Estimates CI p
(Intercept) 1686.30 1508.09 – 1864.52 <0.001
release year -0.79 -0.88 – -0.70 <0.001
Observations 5157
R2 / R2 adjusted 0.056 / 0.056
AIC 48011.853
# The model matrix became rank deficient and some paramteres were not estimable. Therefore, I decided not to remove outliers from the model, as they likely represent valid aspects of the data.

Checking normality

Complex model

qqnorm(residuals(duration_trend_model_complex))
qqline(residuals(duration_trend_model_complex), col = "red")

# looks fairly normal

# too many observations (>5000) to run a Shapiro-wilk test

Simple model

qqnorm(residuals(duration_trend_model_simple))
qqline(residuals(duration_trend_model_simple), col = "red")

# looks fairly normal

# too many observations (>5000) to run a Shapiro-wilk test

Checking linearity

Both models (only one continuous variable)

# crPlots(duration_trend_model_complex) -> not available for interactions

crPlots(duration_trend_model_simple)

# looks farily linear

Checking homoscedasticity

Complex model

plot(fitted(duration_trend_model_complex), residuals(duration_trend_model_complex), main = "Residuals vs Fitted", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")

plot(duration_trend_model_complex, which = 3)

check_heteroskedasticity(duration_trend_model_complex)
## Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).
  # the assumption of homoscedasticity does not seem to be met -> attemtping the log-transformation of the dependent variable

Simple model

plot(fitted(duration_trend_model_simple), residuals(duration_trend_model_simple), main = "Residuals vs Fitted", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")

plot(duration_trend_model_simple, which = 3)

check_heteroskedasticity(duration_trend_model_simple)
## Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).
  # the assumption of homoscedasticity does not seem to be met

Log-transforming the dependent variable

duration_trend <- duration_trend %>%
  mutate(duration_log = log(duration))

Re-running the model with log-transformed dependent variable and checking homoscedasticity

duration_trend_model_complex_log <- lm(duration_log ~ release_year * rating, data = duration_trend)
summary(duration_trend_model_complex_log)
## 
## Call:
## lm(formula = duration_log ~ release_year * rating, data = duration_trend)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.01366 -0.11533  0.02341  0.16911  1.22178 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  2.407e+01  5.517e+00   4.362 1.31e-05 ***
## release_year                -9.832e-03  2.761e-03  -3.562 0.000372 ***
## ratingNC-17                  2.467e+02  1.654e+02   1.491 0.135934    
## ratingNR                    -1.947e+01  8.658e+00  -2.249 0.024539 *  
## ratingPG                    -1.040e+01  6.592e+00  -1.578 0.114724    
## ratingPG-13                 -1.200e+01  6.528e+00  -1.838 0.066142 .  
## ratingR                     -1.526e+01  6.022e+00  -2.535 0.011285 *  
## ratingTV-14                 -9.698e+00  5.739e+00  -1.690 0.091122 .  
## ratingTV-G                  -1.382e+01  9.358e+00  -1.477 0.139699    
## ratingTV-MA                 -1.789e+01  5.991e+00  -2.986 0.002835 ** 
## ratingTV-PG                 -8.547e+00  5.928e+00  -1.442 0.149424    
## ratingTV-Y                   4.135e+01  1.865e+01   2.217 0.026632 *  
## ratingTV-Y7                 -9.189e+01  1.769e+01  -5.196 2.12e-07 ***
## ratingTV-Y7-FV              -1.154e+02  1.264e+02  -0.913 0.361359    
## ratingUR                    -2.106e+01  1.963e+01  -1.073 0.283326    
## release_year:ratingNC-17    -1.222e-01  8.209e-02  -1.488 0.136766    
## release_year:ratingNR        9.806e-03  4.317e-03   2.272 0.023153 *  
## release_year:ratingPG        5.306e-03  3.294e-03   1.611 0.107253    
## release_year:ratingPG-13     6.145e-03  3.261e-03   1.884 0.059578 .  
## release_year:ratingR         7.769e-03  3.010e-03   2.581 0.009889 ** 
## release_year:ratingTV-14     5.007e-03  2.870e-03   1.745 0.081108 .  
## release_year:ratingTV-G      6.882e-03  4.656e-03   1.478 0.139467    
## release_year:ratingTV-MA     9.013e-03  2.994e-03   3.010 0.002622 ** 
## release_year:ratingTV-PG     4.340e-03  2.964e-03   1.465 0.143104    
## release_year:ratingTV-Y     -2.076e-02  9.252e-03  -2.244 0.024846 *  
## release_year:ratingTV-Y7     4.548e-02  8.783e-03   5.178 2.33e-07 ***
## release_year:ratingTV-Y7-FV  5.721e-02  6.272e-02   0.912 0.361751    
## release_year:ratingUR        1.066e-02  9.802e-03   1.088 0.276809    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.307 on 5344 degrees of freedom
## Multiple R-squared:  0.2476, Adjusted R-squared:  0.2438 
## F-statistic: 65.14 on 27 and 5344 DF,  p-value: < 2.2e-16
tab_model(duration_trend_model_complex_log)
  duration log
Predictors Estimates CI p
(Intercept) 24.07 13.25 – 34.88 <0.001
release year -0.01 -0.02 – -0.00 <0.001
ratingNC-17 246.67 -77.59 – 570.94 0.136
rating [NR] -19.47 -36.45 – -2.50 0.025
rating [PG] -10.40 -23.32 – 2.52 0.115
ratingPG-13 -12.00 -24.79 – 0.80 0.066
rating [R] -15.26 -27.07 – -3.46 0.011
ratingTV-14 -9.70 -20.95 – 1.55 0.091
rating [TV-G] -13.82 -32.17 – 4.52 0.140
rating [TV-MA] -17.89 -29.64 – -6.15 0.003
rating [TV-PG] -8.55 -20.17 – 3.07 0.149
rating [TV-Y] 41.35 4.79 – 77.91 0.027
rating [TV-Y7] -91.89 -126.57 – -57.22 <0.001
rating [TV-Y7-FV] -115.37 -363.13 – 132.39 0.361
rating [UR] -21.06 -59.53 – 17.42 0.283
release_year:ratingNC-17 -0.12 -0.28 – 0.04 0.137
release year × rating
[NR]
0.01 0.00 – 0.02 0.023
release year × rating
[PG]
0.01 -0.00 – 0.01 0.107
release_year:ratingPG-13 0.01 -0.00 – 0.01 0.060
release year × rating [R] 0.01 0.00 – 0.01 0.010
release_year:ratingTV-14 0.01 -0.00 – 0.01 0.081
release year × rating
[TV-G]
0.01 -0.00 – 0.02 0.139
release year × rating
[TV-MA]
0.01 0.00 – 0.01 0.003
release year × rating
[TV-PG]
0.00 -0.00 – 0.01 0.143
release year × rating
[TV-Y]
-0.02 -0.04 – -0.00 0.025
release year × rating
[TV-Y7]
0.05 0.03 – 0.06 <0.001
release year × rating
[TV-Y7-FV]
0.06 -0.07 – 0.18 0.362
release year × rating
[UR]
0.01 -0.01 – 0.03 0.277
Observations 5372
R2 / R2 adjusted 0.248 / 0.244
plot(fitted(duration_trend_model_complex_log), residuals(duration_trend_model_complex_log), main = "Residuals vs Fitted", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")

check_heteroskedasticity(duration_trend_model_complex_log)
## Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).
  # the assumption of homoscedasticity still does not seem to be met
  # I tried running a robust regression with the "robust" package, but ran into multiple errors, so I will continue with the model as is while keeping in mind its limitations.

Multicollinearity

# I will skip this step since there is only one continuous predictor in the model.

Comparing models

anova(duration_trend_model_simple, duration_trend_model_complex)
## Analysis of Variance Table
## 
## Model 1: duration ~ release_year
## Model 2: duration ~ release_year * rating
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1   5370 4186702                                  
## 2   5344 3455660 26    731043 43.481 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The likelihood ratio test comparing the two models indicated a significant improvement in fit for the complex model compared to the simple model (F(26) = 43.481, p < 0.001)

summary(duration_trend_model_simple)$adj.r.squared # 0.041
## [1] 0.04176801
summary(duration_trend_model_complex)$adj.r.squared # 0.205
## [1] 0.2052375
# The complex model explains significantly more variance than the simple model.

# AIC comparison
AIC(duration_trend_model_simple, duration_trend_model_complex)
##                              df      AIC
## duration_trend_model_simple   3 51020.37
## duration_trend_model_complex 29 50041.48
# The more complex model has a lower AIC value (rule of thumb: if a model is 2 AIC units lower than the other, it is considered significantly better).

Plotting

Simple model without ratings

duration_trend %>%
  ggplot(aes(release_year, duration)) +
  geom_jitter(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Duration of movies over time by rating",
       x = "Release year", y = "Duration (minutes)")
## `geom_smooth()` using formula = 'y ~ x'

Complex model with ratings

duration_trend %>%
  ggplot(aes(release_year, duration, color = rating)) +
  geom_jitter(alpha = 0.2) +
  geom_smooth(method = "lm", lwd = 1.5, se = FALSE) +
  labs(title = "Duration of movies over time by rating",
       x = "Release year", y = "Duration (minutes)",
       color = "Rating") +
  scale_color_viridis_d()
## `geom_smooth()` using formula = 'y ~ x'

Interpretation and conclusions

There is a significant negative coefficient for release year, suggesting that, on average, the duration of movies have decreased over time. Each additional year corresponds to an approximate decrease in duration by 0.77 minutes. This confirms our hypothesis on the decrease in duration of movies over time.

Several ratings are significantly associated with duration. For example, movies with the rating NC-17 (No One 17 and Under Admitted) have a much higher duration than the reference rating (G = General Audiences), with an increase of 30,600 minutes on average.

Many of the interaction terms are significant, indicating that the duration change over time varies across ratings.

Even the more complex model explains only about 20.5% of the variance in the data, indicating that other variables not included in the model (such as the genre, popularity, or platforms) may have a strong impact on movie duration.

The model does not meet the assumption of homoscedasticity, which may affect the reliability of the results. I attempted to log-transform the dependent variable to address this issue, but the assumption was still not met. I also attempted to run a robust regression, but ran into multiple errors. Therefore, these results should be interpreted with caution.